import pandas as pd
import numpy as np
import seaborn as sns
import pylab as py
%pylab inline
Populating the interactive namespace from numpy and matplotlib
# Load the passenger training data from the local CSV (the file name,
# including the " (1)" suffix, must match the copy on disk).
cdf=pd.read_csv('train (1).csv')
# Show the first five rows for a quick look at the columns.
cdf.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Show 10 randomly chosen rows; no random_state is given, so the selection
# differs from run to run.
cdf.sample(10)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 265 | 266 | 0 | 2 | Reeves, Mr. David | male | 36.0 | 0 | 0 | C.A. 17248 | 10.5000 | NaN | S |
| 571 | 572 | 1 | 1 | Appleton, Mrs. Edward Dale (Charlotte Lamson) | female | 53.0 | 2 | 0 | 11769 | 51.4792 | C101 | S |
| 225 | 226 | 0 | 3 | Berglund, Mr. Karl Ivar Sven | male | 22.0 | 0 | 0 | PP 4348 | 9.3500 | NaN | S |
| 808 | 809 | 0 | 2 | Meyer, Mr. August | male | 39.0 | 0 | 0 | 248723 | 13.0000 | NaN | S |
| 17 | 18 | 1 | 2 | Williams, Mr. Charles Eugene | male | NaN | 0 | 0 | 244373 | 13.0000 | NaN | S |
| 153 | 154 | 0 | 3 | van Billiard, Mr. Austin Blyler | male | 40.5 | 0 | 2 | A/5. 851 | 14.5000 | NaN | S |
| 56 | 57 | 1 | 2 | Rugg, Miss. Emily | female | 21.0 | 0 | 0 | C.A. 31026 | 10.5000 | NaN | S |
| 68 | 69 | 1 | 3 | Andersson, Miss. Erna Alexandra | female | 17.0 | 4 | 2 | 3101281 | 7.9250 | NaN | S |
| 585 | 586 | 1 | 1 | Taussig, Miss. Ruth | female | 18.0 | 0 | 2 | 110413 | 79.6500 | E68 | S |
| 486 | 487 | 1 | 1 | Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby) | female | 35.0 | 1 | 0 | 19943 | 90.0000 | C93 | S |
# Summarise column dtypes and non-null counts.
cdf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 891 non-null int64 1 Survived 891 non-null int64 2 Pclass 891 non-null int64 3 Name 891 non-null object 4 Sex 891 non-null object 5 Age 714 non-null float64 6 SibSp 891 non-null int64 7 Parch 891 non-null int64 8 Ticket 891 non-null object 9 Fare 891 non-null float64 10 Cabin 204 non-null object 11 Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.7+ KB
# Number of missing values in each column.
cdf.isnull().sum()
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
# Large figure so the per-row missing-value pattern stays readable.
fig=py.figure(figsize=(15,15))
# Heatmap of the boolean "is missing" mask; row tick labels are suppressed.
sns.heatmap(cdf.isnull(),yticklabels=False,)
<AxesSubplot:>
# Show the first five rows again.
cdf.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Switch seaborn to the 'darkgrid' theme for the plots that follow.
sns.set(style='darkgrid')
# Bar chart of the number of rows for each 'Survived' value.
sns.countplot(x='Survived',data=cdf)
<AxesSubplot:xlabel='Survived', ylabel='count'>
# Same count per 'Survived' value, split into one bar per passenger class.
sns.countplot(x='Survived',data=cdf,hue='Pclass')
<AxesSubplot:xlabel='Survived', ylabel='count'>
# Distribution of 'Age' drawn with 30 bins on a facet of height 8.
sns.displot(cdf['Age'],height=8,bins=30)
<seaborn.axisgrid.FacetGrid at 0x22d5ab3f9a0>
# Show the first five rows again.
cdf.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Bar chart of the number of rows for each 'SibSp' value.
sns.countplot(x='SibSp',data=cdf)
<AxesSubplot:xlabel='SibSp', ylabel='count'>
# Histogram of 'Fare' with 30 bins, drawn through pandas' plotting API.
cdf['Fare'].plot(kind='hist',bins=30,)
<AxesSubplot:ylabel='Frequency'>
# cufflinks provides the .iplot() method used on the Series below.
import cufflinks as cf
# NOTE(review): connected=False presumably keeps plotting fully offline
# (no plotly.js fetched from the network) - confirm against cufflinks docs.
cf.go_offline(connected=False)
# Histogram of 'Fare' rendered through cufflinks.
cdf['Fare'].iplot(kind='hist')
# Show the first five rows again.
cdf.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Box plot of 'Age' grouped by passenger class.
sns.boxplot(x='Pclass',y='Age',data=cdf)
<AxesSubplot:xlabel='Pclass', ylabel='Age'>
# Print the mean age of passenger classes 1, 2 and 3 in that order.
# The three-newline separator reproduces the blank lines that a
# print('\n') between consecutive values would emit.
print(
    *(cdf.loc[cdf['Pclass'] == pclass, 'Age'].mean() for pclass in (1, 2, 3)),
    sep='\n\n\n',
)
38.233440860215055 29.87763005780347 25.14061971830986
def impute(new_age):
    """Return a passenger's age, filling a missing value with a class average.

    Intended for row-wise use, e.g.
    ``cdf[['Age', 'Pclass']].apply(impute, axis=1)``.

    Parameters
    ----------
    new_age : pandas.Series
        Row whose first element is the age and whose second element is the
        passenger class. Elements are read by position, so the column labels
        themselves do not matter.

    Returns
    -------
    The original age when it is present. Otherwise the mean age of the
    passenger's class in the module-level ``cdf`` frame, truncated to an
    ``int``. Any class other than 1 or 2 uses the class-3 mean.
    """
    # Positional access must go through .iloc: plain ``row[0]`` on a
    # label-indexed Series is deprecated in pandas and is treated as a
    # lookup of the *label* 0 in newer releases.
    age = new_age.iloc[0]
    pclass = new_age.iloc[1]

    if not pd.isnull(age):
        return age

    # Classes 1 and 2 use their own mean; everything else falls back to 3.
    group = pclass if pclass in (1, 2) else 3
    return int(cdf.loc[cdf['Pclass'] == group, 'Age'].mean())
# Sanity check: the value impute() substitutes for a missing class-1 age
# (the class mean truncated to an integer).
int(cdf[cdf['Pclass']==1]['Age'].mean())
38
# Build 'new_age' row by row: the original Age where present, otherwise the
# value returned by impute() for that row's passenger class.
cdf['new_age']=cdf[['Age','Pclass']].apply(impute, axis=1)
# Display the frame with the new column.
cdf
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | new_age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 22.0 |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 38.0 |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 26.0 |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 35.0 |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 35.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S | 27.0 |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 19.0 |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 25.0 |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 26.0 |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 32.0 |
891 rows × 13 columns
# Take the name stored at index label 1 as a sample for experimenting with
# title extraction.
c=cdf['Name'].loc[1]
c
'Cumings, Mrs. John Bradley (Florence Briggs Thayer)'
# Second space-separated token of the sample name, i.e. its title ('Mrs.').
c.split(' ')[1]
'Mrs.'
def tit(le):
    """Extract the title (e.g. ``'Mr.'``) from a ``'Surname, Title. Given'`` name.

    The title is the text between the first comma and the first period after
    it, returned with the trailing period. Anchoring on the comma rather than
    on spaces keeps multi-word surnames such as ``'van Billiard, Mr. Austin'``
    from yielding a surname fragment instead of the title.

    Parameters
    ----------
    le : str
        Full passenger name.

    Returns
    -------
    str
        The title including its trailing period. If the name has no comma,
        the second space-separated token is returned; if nothing after the
        comma contains a period, the first token after the comma is returned.
    """
    _, comma, remainder = le.partition(',')
    if not comma:
        # No "Surname," prefix: keep the legacy second-token behaviour.
        return le.split(' ')[1]

    remainder = remainder.strip()
    title, period, _ = remainder.partition('.')
    if not period:
        # No period-terminated title: fall back to the first word.
        return remainder.split(' ')[0]
    return title.strip() + '.'
# Derive a 'title' column by applying tit() to every name.
cdf['title']=cdf['Name'].apply(tit)
# Line plot of the count of non-null 'new_age' values for each title.
cdf.groupby(by='title').count()['new_age'].plot()
<AxesSubplot:xlabel='title'>
# Show the first five rows, now including 'new_age' and 'title'.
cdf.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | new_age | title | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 22.0 | Mr. |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 38.0 | Mrs. |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 26.0 | Miss. |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 35.0 | Mrs. |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 35.0 | Mr. |
# Drop the columns that are not fed to the model below; the raw 'Age' is
# removed in favour of the imputed 'new_age'.
cdf=cdf.drop(['PassengerId','Name','Age','Ticket','Cabin'],axis=1)
# Display the reduced frame.
cdf
| Survived | Pclass | Sex | SibSp | Parch | Fare | Embarked | new_age | title | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | male | 1 | 0 | 7.2500 | S | 22.0 | Mr. |
| 1 | 1 | 1 | female | 1 | 0 | 71.2833 | C | 38.0 | Mrs. |
| 2 | 1 | 3 | female | 0 | 0 | 7.9250 | S | 26.0 | Miss. |
| 3 | 1 | 1 | female | 1 | 0 | 53.1000 | S | 35.0 | Mrs. |
| 4 | 0 | 3 | male | 0 | 0 | 8.0500 | S | 35.0 | Mr. |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | male | 0 | 0 | 13.0000 | S | 27.0 | Rev. |
| 887 | 1 | 1 | female | 0 | 0 | 30.0000 | S | 19.0 | Miss. |
| 888 | 0 | 3 | female | 1 | 2 | 23.4500 | S | 25.0 | Miss. |
| 889 | 1 | 1 | male | 0 | 0 | 30.0000 | C | 26.0 | Mr. |
| 890 | 0 | 3 | male | 0 | 0 | 7.7500 | Q | 32.0 | Mr. |
891 rows × 9 columns
# One-hot encode the remaining non-numeric columns (Sex, Embarked, title).
# drop_first=True omits the first level of each so the dummies are not
# redundant with one another.
cd=pd.get_dummies(cdf,drop_first=True)
# Preview the encoded frame.
cd.head()
| Survived | Pclass | SibSp | Parch | Fare | new_age | Sex_male | Embarked_Q | Embarked_S | title_Capt. | ... | title_Pelsmaeker, | title_Planke, | title_Rev. | title_Shawah, | title_Steen, | title_Velde, | title_Walle, | title_der | title_the | title_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 0 | 7.2500 | 22.0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 0 | 71.2833 | 38.0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 3 | 0 | 0 | 7.9250 | 26.0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 1 | 1 | 0 | 53.1000 | 35.0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 3 | 0 | 0 | 8.0500 | 35.0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 39 columns
from sklearn.ensemble import ExtraTreesClassifier
# Show the first five rows of the encoded frame again.
cd.head()
| Survived | Pclass | SibSp | Parch | Fare | new_age | Sex_male | Embarked_Q | Embarked_S | title_Capt. | ... | title_Pelsmaeker, | title_Planke, | title_Rev. | title_Shawah, | title_Steen, | title_Velde, | title_Walle, | title_der | title_the | title_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 0 | 7.2500 | 22.0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 0 | 71.2833 | 38.0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 3 | 0 | 0 | 7.9250 | 26.0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 1 | 1 | 0 | 53.1000 | 35.0 | 0 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 3 | 0 | 0 | 8.0500 | 35.0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 39 columns
# Features: every encoded column except the target.
X=cd.drop('Survived',axis=1)
# Target: the 'Survived' column.
y=cd['Survived']
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
from sklearn.linear_model import LogisticRegression
# NOTE(review): max_iter is raised to 500, presumably so the solver
# converges on these unscaled features - confirm.
logmodel=LogisticRegression(max_iter=500)
logmodel.fit(X_train,y_train)
LogisticRegression(max_iter=500)
# Predicted labels for the held-out rows.
predict=logmodel.predict(X_test)
from sklearn.metrics import classification_report , confusion_matrix
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test,predict))
precision recall f1-score support
0 0.82 0.92 0.87 169
1 0.87 0.73 0.79 126
accuracy 0.84 295
macro avg 0.84 0.82 0.83 295
weighted avg 0.84 0.84 0.83 295
# Confusion matrix of true labels (first argument) against predictions.
print(confusion_matrix(y_test,predict))
[[155 14] [ 34 92]]
# Heatmap of the confusion matrix. fmt='d' prints the cell counts as plain
# integers; the default '.2g' format would render 155 as 1.6e+02.
sns.heatmap(confusion_matrix(y_test,predict),annot=True,fmt='d')
<AxesSubplot:>